# plp_processing.R --------------------------------------------------------

# Author: vgouirand
# Description: processing / dimensional reduction / cluster calls, etc.
# Input: here::here("input", "NST|PSO")
# Output: here::here("output", "processing")
# Date: 2024_1_11


# Library Import ----------------------------------------------------------

library(conflicted)
library(cowplot)
library(bluster)
library(edgeR)
library(fgsea)
library(harmony)
library(hgnc)
library(here)
# library(lisi)
library(limma)
library(patchwork)
library(pheatmap)
library(msigdbr)
library(psych)
# library(presto)
library(rtracklayer)
library(scDblFinder)
library(SingleCellExperiment)
library(scater)
library(scran)
library(scRepertoire)
library(scuttle)
library(Seurat)
library(tidyverse)
library(Biobase)
library(SeuratObject)
library(ggplot2)
library(ggrepel)
library(R.utils)
library(utf8)
library(patchwork)
library(tidyr)
library(dplyr)

# Project helpers
source(here("R", "plp_utils.R"))

# Bind the bare names to the dplyr / SeuratObject implementations used below
filter <- dplyr::filter
Assays <- SeuratObject::Assays

## Load data: start from the plp_raw_processed object made by SC
plp <- readr::read_rds("./plp_raw_processed.rds.gz")

## Remove outliers
# Count, per cell, how many logical metadata columns are TRUE
# (e.g., the columns ending in "_MAD") and keep only cells with none.
# `is.logical()` is used instead of comparing `class()` output, and
# `drop = FALSE` keeps a data frame even when only one flag column exists,
# so `rowSums()` always receives a 2-D input.
mad_flag_cols <- vapply(plp[[]], is.logical, logical(1))
plp[["sum.mads"]] <- rowSums(
  x = plp[[]][, mad_flag_cols, drop = FALSE],
  na.rm = TRUE
)
plp <- plp[, plp$sum.mads == 0]

## Remove blood (keep skin cells only), then normalize
plp <- plp[, plp$tissue == "Skin"]
plp <- NormalizeData(object = plp)

# Genes eligible for variance modelling: features whose seqid is 1-22 or X
# (i.e. not Y, MT, clone-based genes, etc.), excluding antisense and TR/IG
# V / pseudogene biotypes, and excluding gene names that match the
# ribosomal-protein (RPS/RPL) or "MT-" patterns.
plp_sr <- local({
  gene_meta <- plp[["RNA"]][[]]
  on_kept_seqid <- gene_meta[["seqid"]] %in% as.character(c(1:22, "X"))
  unwanted_biotype <- grepl(
    pattern = "antisense|^((TR)|(IG))_((V)|((C|D|J)_pseudo))",
    x = gene_meta[["gene_biotype"]]
  )
  ribo_or_mito <- grepl(
    pattern = "(-M)?RP(S|L)|MT-",
    x = gene_meta[["gene_name"]]
  )
  gene_meta[["gene_name"]][on_kept_seqid & !unwanted_biotype & !ribo_or_mito]
})

# Model per-gene variance on the normalized data, blocking on patient via
# the design matrix; both the trend fit and the output are restricted to
# the eligible genes.
plp_var <- modelGeneVar(
  GetAssayData(object = plp, assay = "RNA", slot = "data"),
  design = model.matrix(~ patient, data = plp[[]]),
  subset.row = plp_sr,
  subset.fit = plp_sr
)

# Use the top 3000 genes ranked by the "bio" field as variable features
VariableFeatures(plp) <- getTopHVGs(stats = plp_var, var.field = "bio", n = 3000)

# Scale Data
# Only the variable features selected above are scaled.
plp <- ScaleData(object = plp, features = VariableFeatures(object = plp))

# PCA
# Compute 50 principal components (stored as the "pca" reduction used below).
plp <- RunPCA(object = plp, npcs = 50)

# Harmony 
# Integrate across patients starting from the PCA embedding; the corrected
# embedding is saved as the "harmony" reduction.
# NOTE(review): the -Inf epsilons presumably disable early stopping so that
# the fixed iteration counts are always run; see the issue linked below.
# github.com/immunogenomics/harmony/issues/130
plp <- RunHarmony(
  object = plp,
  reduction = "pca",
  reduction.save = "harmony",
  group.by.vars = c("patient"),
  max.iter.harmony = 10,
  max.iter.cluster = 20,
  epsilon.harmony = -Inf,
  epsilon.cluster = -Inf,
  plot_convergence = TRUE, 
  verbose = TRUE
)

# UMAP
# Built on all 50 dimensions of the "harmony" reduction.
plp <- RunUMAP(
  object = plp, 
  reduction = "harmony", 
  dims = 1:50, 
  verbose = TRUE
)
# Visual check of the embedding, coloured by patient
DimPlot(plp, group.by = "patient")


# FindNeighbors
# Neighbour graph on the same 50 "harmony" dimensions used for the UMAP.
plp <- FindNeighbors(
  object = plp, 
  reduction = "harmony", 
  dims = 1:50
)

# FindClusters
# Cluster at ten resolutions, 0.1 to 1 in steps of 0.1.
# NOTE(review): algorithm 4 is Leiden per the Seurat documentation; confirm
# for the installed version.
plp <- FindClusters(
  object = plp, 
  method = "igraph",
  algorithm = 4L,
  resolution = seq(from = 0.1, to = 1, by = 0.1)
)

# Cluster Identification --------------------------------------------------

# Continue from the processed object stored on disk.
# NOTE(review): this replaces the `plp` computed above; the file is not
# written anywhere in the visible part of this script.
plp <- read_rds(file = here("output", "processing", "plp_processed2.rds.gz"))

# One UMAP facet per clustering resolution, with each cluster labelled at
# the median position of its cells.
plp_resolutions <-
  FetchData(
    object = plp,
    vars = c(
      "UMAP_1",
      "UMAP_2",
      "patient",
      grep(
        pattern = "^RNA_snn_res\\.", # escaped dot: match the literal prefix
        x = colnames(plp[[]]),
        value = TRUE
      )
    )
  ) %>%
  pivot_longer(
    cols = starts_with("RNA_snn_res."),
    names_to = "resolution",
    values_to = "cluster"
  ) %>%
  ggplot(aes(x = UMAP_1, y = UMAP_2)) +
  scattermore::geom_scattermore(aes(color = cluster)) +
  ggrepel::geom_text_repel(
    # Label positions: per-resolution, per-cluster median coordinates.
    # `.groups = "drop"` returns an ungrouped table and silences the
    # regrouping message.
    data = . %>%
      group_by(resolution, cluster) %>%
      summarize(
        UMAP_1 = median(UMAP_1),
        UMAP_2 = median(UMAP_2),
        .groups = "drop"
      ),
    aes(label = cluster),
    size = 6 / ggplot2::.pt, # 6 pt text; `.pt` is exported, no `:::` needed
    min.segment.length = 0
  ) +
  facet_wrap(~ resolution) +
  theme(legend.position = "none")



## Cluster overview at resolution 0.4
Idents(object = plp) <- "RNA_snn_res.0.4"
DimPlot(object = plp)
plp <- BuildClusterTree(plp)
PlotClusterTree(plp)

# Positive markers for every cluster
plp_markers <- FindAllMarkers(plp, only.pos = TRUE)

# For each cluster, the first 10 markers (in FindAllMarkers' row order)
# among those with avg_log2FC > 1
top10 <- ungroup(
  slice_head(
    dplyr::filter(group_by(plp_markers, cluster), avg_log2FC > 1),
    n = 10
  )
)
DoHeatmap(plp, features = top10$gene) + NoLegend()

# Export the marker table, then reload it from disk
plp_markers_path <- here("output", "processing", "plp_markers.rds.gz")
write_rds(x = plp_markers, file = plp_markers_path, compress = "gz")
plp_markers <- read_rds(file = plp_markers_path)


# Map the numeric cluster identities to cell-type labels; several clusters
# share a label and are merged by the renaming.
plp <- RenameIdents(
  plp,
  "1" = "CD4+ Tregs",
  "2" = "CD8+ T cells",
  "3" = "CD8+ T cells",
  "4" = "CD4+ Teffs",
  "5" = "CD4+ Teffs",
  "6" = "Myeloid cells",
  "7" = "NK cells",
  "8" = "NK cells",
  "9" = "Neutrophils",
  "10" = "Mast cells",
  "11" = "B cells",
  "12" = "CD4+ Tregs",
  "13" = "Proliferating cells"
)

# Store the labels in the metadata column "cluster_names". Direct assignment
# replaces the legacy StashIdent() wrapper.
plp[["cluster_names"]] <- Idents(object = plp)

# Fix the display order of the labels
plp$cluster_names <- factor(
  x = plp$cluster_names,
  levels = c(
    "CD4+ Teffs", "CD4+ Tregs", "CD8+ T cells", "NK cells",
    "Neutrophils", "Myeloid cells", "Mast cells",
    "B cells",
    "Proliferating cells"
  )
)

# Make the ordered labels the active identity
Idents(object = plp) <- plp$cluster_names

# Feature plots for the marker genes of each cluster
FeaturePlot(plp, features = c("FOXP3", "CD8A", "HLA-DRA", "CD19", "CTLA4", "ITGAM", "ITGAX", "NCAM1"))
# BiocManager::install("Nebulosa")
# KDE - kernel density estimate, to make a nice plot!
Nebulosa::plot_density(object = plp, reduction = "umap", features = c("LAYN", "FOXP3"))
Nebulosa::plot_density(object = plp, reduction = "umap", features = c("CD8A", "CD4", "HLA-DRA", "CD19", "CTLA4", "ITGAM", "ITGAX", "NCAM1"))

# The plots saved below must be assigned first: the original script passed
# `Featureplot_LAYN`, `Dotplot_layn` and `Dotplot_broad` to ggsave() without
# ever creating them.
# NOTE(review): the plot-to-file mapping is inferred from the file names;
# confirm it matches the intended figures.
Featureplot_LAYN <- FeaturePlot(plp, features = c("FOXP3", "LAYN"))
Featureplot_LAYN
Dotplot_broad <- DotPlot(plp, features = c("LAYN", "FOXP3", "CD8A", "HLA-DRA", "CD19", "CTLA4", "ITGAM", "ITGAX", "NCAM1"))
Dotplot_broad
Dotplot_layn <- DotPlot(object = plp, features = c("LAYN", "FOXP3"))
Dotplot_layn

ggsave(filename = here("output", "processing", "FeaturePlot_layn.pdf"), plot = Featureplot_LAYN)
ggsave(filename = here("output", "processing", "Dotplot_layn.pdf"), plot = Dotplot_layn)
ggsave(filename = here("output", "processing", "Dotplot_broad.pdf"), plot = Dotplot_broad)

# Flag each cell as LAYN-positive when it has at least one LAYN count
plp$LAYN_POS <- ifelse(
  test = GetAssayData(object = plp, slot = "counts")["LAYN", ] > 0,
  yes = "YES",
  no = "NO"
)

# Subcluster Treg ----------------------------------------------------------

# Keep only the cells labelled "CD4+ Tregs"
plp_treg <- subset(x = plp, idents = "CD4+ Tregs")

# UMAP of the Treg subset coloured by LAYN status (built once, then shown)
DimPlot_foxp3 <- DimPlot(plp_treg, reduction = "umap", group.by = "LAYN_POS")
DimPlot_foxp3

# Violin plots split by LAYN status; `FALSE` is spelled out because `F` is
# an ordinary variable that can be reassigned.
VlnPlot_foxp3 <- VlnPlot(
  object = plp_treg,
  features = c("LAYN", "FOXP3", "CTLA4"),
  split.by = "LAYN_POS",
  split.plot = FALSE
)

ggsave(filename = here("output", "processing", "DimPlot_subcluster.pdf"), plot = DimPlot_foxp3, device = "pdf", dpi = 300)
ggsave(filename = here("output", "processing", "VlnPlot_subcluster.pdf"), plot = VlnPlot_foxp3, device = "pdf", dpi = 300)

# Model Gene Variance (Treg subset)
# Eligible genes: seqid 1-22 or X (i.e. not Y, MT, clone-based genes, etc.),
# excluding antisense and TR/IG V / pseudogene biotypes, and excluding gene
# names that match the ribosomal-protein (RPS/RPL) or "MT-" patterns.
# Note: this overwrites the `plp_sr` computed for the full object.
plp_sr <- local({
  gene_meta <- plp_treg[["RNA"]][[]]
  on_kept_seqid <- gene_meta[["seqid"]] %in% as.character(c(1:22, "X"))
  unwanted_biotype <- grepl(
    pattern = "antisense|^((TR)|(IG))_((V)|((C|D|J)_pseudo))",
    x = gene_meta[["gene_biotype"]]
  )
  ribo_or_mito <- grepl(
    pattern = "(-M)?RP(S|L)|MT-",
    x = gene_meta[["gene_name"]]
  )
  gene_meta[["gene_name"]][on_kept_seqid & !unwanted_biotype & !ribo_or_mito]
})

# Per-gene variance on the normalized data of the subset, blocking on
# patient; fit and output are restricted to the eligible genes.
plp_treg_var <- modelGeneVar(
  GetAssayData(object = plp_treg, assay = "RNA", slot = "data"),
  design = model.matrix(~ patient, data = plp_treg[[]]),
  subset.row = plp_sr,
  subset.fit = plp_sr
)

# Top 3000 genes by the "bio" field become the subset's variable features
VariableFeatures(plp_treg) <- getTopHVGs(stats = plp_treg_var, var.field = "bio", n = 3000)

# Scale Data
# Only the Treg-specific variable features selected above are scaled.
plp_treg <- ScaleData(object = plp_treg, features = VariableFeatures(object = plp_treg))

# PCA
# Recompute 50 principal components on the Treg subset.
plp_treg <- RunPCA(object = plp_treg, npcs = 50)

# Harmony 
# Same settings as for the full object: integrate across patients from the
# "pca" reduction and save the result as "harmony".
# NOTE(review): the -Inf epsilons presumably disable early stopping; see the
# issue linked below.
# github.com/immunogenomics/harmony/issues/130
plp_treg <- RunHarmony(
  object = plp_treg,
  reduction = "pca",
  reduction.save = "harmony",
  group.by.vars = c("patient"),
  max.iter.harmony = 10,
  max.iter.cluster = 20,
  epsilon.harmony = -Inf,
  epsilon.cluster = -Inf,
  plot_convergence = TRUE, 
  verbose = TRUE
)


# UMAP
# Rebuilt for the subset on all 50 "harmony" dimensions.
plp_treg <- RunUMAP(
  object = plp_treg, 
  reduction = "harmony", 
  dims = 1:50, 
  verbose = TRUE
)
# Visual check of the subset embedding, coloured by patient
DimPlot(plp_treg, group.by = "patient")

# FindNeighbors
# Neighbour graph on the same 50 "harmony" dimensions.
plp_treg <- FindNeighbors(
  object = plp_treg, 
  reduction = "harmony", 
  dims = 1:50
)

# FindClusters
# Cluster the subset at resolutions 0.1 to 1 in steps of 0.1.
# NOTE(review): algorithm 4 is Leiden per the Seurat documentation; confirm
# for the installed version.
plp_treg <- FindClusters(
  object = plp_treg, 
  method = "igraph",
  algorithm = 4L,
  resolution = seq(from = 0.1, to = 1, by = 0.1)
)

#Pseudobulk------------
##Pseudobulk analysis on treg subset (plp_treg)--------------

# Sum raw counts per patient x disease x LAYN status combination
pseudo_plp <- AggregateExpression(
  plp_treg,
  group.by = c("patient", "disease", "LAYN_POS"),
  assays = "RNA",
  slot = "counts"
)
pseudo_plp <- pseudo_plp[["RNA"]]
colnames(pseudo_plp) <- as.character(colnames(pseudo_plp)) # Changes colnames to simple text

# Quick look at the aggregated LAYN counts. The matrix is indexed directly
# because tibble::enframe() only accepts one-dimensional input.
pseudo_plp[rownames(pseudo_plp) == "LAYN", , drop = FALSE]

dge <- DGEList(pseudo_plp)

# Recover the grouping variables from the "<patient>_<disease>_<LAYN>"
# column names: first field, middle field(s), last field.
patient <- gsub("_.*", "", colnames(pseudo_plp))
disease <- gsub("^[^_]*_(.*?)_[^_]*$", "\\1", colnames(pseudo_plp), perl = TRUE)
layn_pos <- as.character(gsub(".*_", "", colnames(pseudo_plp)))
disease_layn_pos <- paste0(disease, "_", layn_pos)

# Means model for disease x LAYN status with an additive patient term
design <- model.matrix(~ 0 + disease_layn_pos + patient)
keep <- filterByExpr(y = dge, design = design)
dge <- dge[keep, , keep.lib.sizes = FALSE]
dge <- calcNormFactors(dge)
vm <- voom(dge, design = design, plot = FALSE)
# Console output recorded from a previous run:
# Coefficients not estimable: patientP05 
# Warning message:
#   Partial NA coefficients for 11156 probe(s) 
fit <- lmFit(vm, design = design)


# LAYN-positive vs LAYN-negative, separately within PSO and within CTRL
contrasts <- makeContrasts(
  PSO_yes_no = disease_layn_posPSO_YES - disease_layn_posPSO_NO,
  CTRL_yes_no = disease_layn_posCTRL_YES - disease_layn_posCTRL_NO,
  levels = coef(fit)
)

fit <- contrasts.fit(fit, contrasts)
fit <- eBayes(fit)

# limma reports `logFC` on the log2 scale already, so `log2FC` is a plain
# copy. The previous division by log(2) inflated every value by ~1.44 and
# shifted the 0.58 (1.5-fold) cut-off applied downstream.
de_result_pseudo_pso <- topTable(fit, coef = "PSO_yes_no", number = Inf, adjust.method = "BH")
de_result_pseudo_pso$log2FC <- de_result_pseudo_pso$logFC
de_result_pseudo_pso <- arrange(de_result_pseudo_pso, adj.P.Val)

de_result_pseudo_ctrl <- topTable(fit, coef = "CTRL_yes_no", number = Inf, adjust.method = "BH")
de_result_pseudo_ctrl$log2FC <- de_result_pseudo_ctrl$logFC
de_result_pseudo_ctrl <- arrange(de_result_pseudo_ctrl, adj.P.Val)


# Volcano plot for PSO, coloured by whether adj.P.Val is below 0.05
p1 <- ggplot(
  data = de_result_pseudo_pso,
  mapping = aes(x = log2FC, y = -log10(P.Value), col = adj.P.Val < 0.05)
) +
  geom_point() +
  scale_colour_manual(values = c('TRUE' = "red", 'FALSE' = "black")) +
  theme_cowplot() +
  ggtitle("Pseudobulk PSO (Volcano)")

# Genes passing both cut-offs: adj.P.Val < 0.05 and |log2FC| >= 0.58
significant_pso_genes <- de_result_pseudo_pso[
  which(
    de_result_pseudo_pso$adj.P.Val < 0.05 &
      abs(de_result_pseudo_pso$log2FC) >= 0.58
  ),
]

# Same volcano with the passing genes drawn in red and labelled
p1_filtrated <- ggplot(
  data = de_result_pseudo_pso,
  mapping = aes(x = log2FC, y = -log10(P.Value))
) +
  geom_point() +
  geom_point(data = significant_pso_genes, aes(col = "Significant"), color = "red") +
  scale_colour_manual(values = c("Significant" = "red", "Not Significant" = "black")) +
  geom_label_repel(
    data = as_tibble(significant_pso_genes, rownames = "genes"),
    mapping = aes(label = genes),
    size = 2,
    max.overlaps = 10
  ) +
  theme_cowplot() +
  ggtitle("Pseudobulk PSO (Volcano)")
p1_filtrated

## Label only the significant genes found in the gene list made for the
## RNAseq analysis
fgsea_leg <- read_csv(file = here("input", "gene_list.csv"))

# Move the gene symbols from the row names into a `genes` column
significant_pso_genes <- rownames_to_column(significant_pso_genes, var = "genes")
filtered_significant_genes <- significant_pso_genes[significant_pso_genes$genes %in% fgsea_leg$hgnc_symbol, ]

p1_filtrated_selected <- ggplot(de_result_pseudo_pso, aes(x = log2FC, y = -log10(P.Value))) +
  geom_point() +
  geom_point(data = significant_pso_genes, aes(col = "Significant"), color = "red") +
  scale_colour_manual(values = c("Significant" = "red", "Not Significant" = "black")) +
  # The label is mapped from the layer's own data (`genes`) instead of
  # `filtered_significant_genes$genes`, which bypasses the layer data.
  geom_label_repel(data = filtered_significant_genes, aes(label = genes), size = 2) +
  theme_cowplot() +
  ggtitle("Pseudobulk PSO (Volcano)")

# Back to the analysis: p2 = control skin
# Volcano plot for CTRL, coloured by whether adj.P.Val is below 0.05
p2 <- ggplot(
  data = de_result_pseudo_ctrl,
  mapping = aes(x = log2FC, y = -log10(P.Value), col = adj.P.Val < 0.05)
) +
  geom_point() +
  scale_colour_manual(values = c('TRUE' = "red", 'FALSE' = "black")) +
  theme_cowplot() +
  ggtitle("Pseudobulk CTRL (Volcano)")

# Genes passing both cut-offs: adj.P.Val < 0.05 and |log2FC| >= 0.58
significant_ctrl_genes <- de_result_pseudo_ctrl[
  which(
    de_result_pseudo_ctrl$adj.P.Val < 0.05 &
      abs(de_result_pseudo_ctrl$log2FC) >= 0.58
  ),
]

# Same volcano with the passing genes drawn in red and labelled
p2_filtrated <- ggplot(
  data = de_result_pseudo_ctrl,
  mapping = aes(x = log2FC, y = -log10(P.Value))
) +
  geom_point() +
  geom_point(data = significant_ctrl_genes, aes(col = "Significant"), color = "red") +
  scale_colour_manual(values = c("Significant" = "red", "Not Significant" = "black")) +
  geom_text_repel(
    data = as_tibble(significant_ctrl_genes, rownames = "genes"),
    mapping = aes(label = genes),
    size = 2,
    max.overlaps = 10
  ) +
  theme_cowplot() +
  ggtitle("Pseudobulk CTRL (Volcano)")
p2_filtrated

# Join the CTRL and PSO results by gene; shared columns get the suffixes
# ".ctrl" and ".pso"
de_result_joint <- full_join(
  x = as_tibble(de_result_pseudo_ctrl, rownames = "genes"),
  y = as_tibble(de_result_pseudo_pso, rownames = "genes"),
  suffix = c(".ctrl", ".pso"),
  by = "genes"
)

# Genes passing both cut-offs in at least one of the two comparisons
selected_genes <- subset(de_result_joint, (adj.P.Val.ctrl <= 0.05 & abs(log2FC.ctrl) >= 0.58) | (adj.P.Val.pso <= 0.05 & abs(log2FC.pso) >= 0.58))
selected_genes$adj.P.Val_ctrl_pso <- ifelse(selected_genes$adj.P.Val.ctrl <= 0.05 | selected_genes$adj.P.Val.pso <= 0.05, "Significant", "Not Significant")

# Background = every gene that was not selected. The match is done on the
# `genes` column: tibbles carry only positional row names ("1", "2", ...),
# so comparing rownames() dropped the first nrow(selected_genes) rows
# instead of the selected genes.
background_genes <- de_result_joint[!de_result_joint$genes %in% selected_genes$genes, ]

# Plot significant and non-significant genes
ggplot(de_result_joint, aes(x = log2FC.ctrl, y = log2FC.pso)) +
  # Non-significant genes in black
  geom_point(data = background_genes, color = "black", alpha = 0.4) +
  # Significant genes
  geom_point(data = selected_genes, aes(color = adj.P.Val_ctrl_pso)) +
  # Vertical and horizontal lines at logFC = 0
  geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  # Diagonal line
  geom_abline(intercept = 0, slope = 1, linetype = "dotted") +
  # Vertical line at x = 0 (drawn over the dashed red one)
  geom_vline(xintercept = 0) +
  # Horizontal line at y = 0 (drawn over the dashed red one)
  geom_hline(yintercept = 0) +
  labs(x = "log2FC(pseudo_ctrl)", y = "log2FC(pseudo_pso)", title = "Quadrant Representation of log2FC") +
  geom_label_repel(data = selected_genes, aes(label = genes), size = 2, max.overlaps = 40) +
  theme_cowplot()

# Genes with adj.P.Val.ctrl <= 0.05 and |log2FC.ctrl| >= 0.58.
# abs() wraps the fold change only; it previously wrapped the whole
# comparison, which kept up-regulated genes only.
genes_ctrl <- subset(selected_genes, adj.P.Val.ctrl <= 0.05 & abs(log2FC.ctrl) >= 0.58)$genes
genes_ctrl_df <- tibble(genes_ctrl)
# Genes with adj.P.Val.pso <= 0.05 and |log2FC.pso| >= 0.58
genes_pso <- subset(selected_genes, adj.P.Val.pso <= 0.05 & abs(log2FC.pso) >= 0.58)$genes
genes_pso_df <- tibble(genes_pso)
# Genes found in only one of the two sets
unique_genes_ctrl <- setdiff(genes_ctrl, genes_pso)
unique_genes_pso <- setdiff(genes_pso, genes_ctrl)

##Pseudo bulk only on FOXP3pos cells (plp_FOXP3)---------------------------------------------

plp_FOXP3 <- read_rds(file = here("output", "processing", "plp_FOXP3.rds.gz"))

# Sum raw counts per patient x disease x LAYN status combination.
# Note: this object's status column is "LAYN_pos" (not "LAYN_POS").
pseudo_plp_foxp3 <- AggregateExpression(
  plp_FOXP3,
  group.by = c("patient", "disease", "LAYN_pos"),
  assays = "RNA",
  slot = "counts"
)
pseudo_foxp3 <- pseudo_plp_foxp3[["RNA"]]
colnames(pseudo_foxp3) <- as.character(colnames(pseudo_foxp3)) # Changes colnames to simple text

# Quick look at the aggregated LAYN counts. The matrix is indexed directly
# because tibble::enframe() only accepts one-dimensional input.
pseudo_foxp3[rownames(pseudo_foxp3) == "LAYN", , drop = FALSE]

dge_foxp3 <- DGEList(pseudo_foxp3)

# Recover the grouping variables from the "<patient>_<disease>_<LAYN>"
# column names. Note: `patient` and `disease` overwrite the vectors built
# for the Treg pseudobulk above.
patient <- gsub("_.*", "", colnames(pseudo_foxp3))
disease <- gsub("^[^_]*_(.*?)_[^_]*$", "\\1", colnames(pseudo_foxp3), perl = TRUE)
layn_pos_foxp3 <- as.character(gsub(".*_", "", colnames(pseudo_foxp3)))
disease_layn_pos_foxp3 <- paste0(disease, "_", layn_pos_foxp3)

# Means model for disease x LAYN status with an additive patient term
design_foxp3 <- model.matrix(~ 0 + disease_layn_pos_foxp3 + patient)
keep_foxp3 <- filterByExpr(y = dge_foxp3, design = design_foxp3)
dge_foxp3 <- dge_foxp3[keep_foxp3, , keep.lib.sizes = FALSE]
dge_foxp3 <- calcNormFactors(dge_foxp3)
vm_foxp3 <- voom(dge_foxp3, design = design_foxp3, plot = FALSE)
# Console output recorded from a previous run:
# Coefficients not estimable: patientP05 
# Warning message:
#   Partial NA coefficients for 10294 probe(s) 
fit_foxp3 <- lmFit(vm_foxp3, design = design_foxp3)


# LAYN-positive vs LAYN-negative, separately within PSO and within CTRL
contrasts_foxp3 <- makeContrasts(
  PSO_yes_no = disease_layn_pos_foxp3PSO_Yes - disease_layn_pos_foxp3PSO_No,
  CTRL_yes_no = disease_layn_pos_foxp3CTRL_Yes - disease_layn_pos_foxp3CTRL_No,
  levels = coef(fit_foxp3)
)

fit_foxp3 <- contrasts.fit(fit_foxp3, contrasts_foxp3)
fit_foxp3 <- eBayes(fit_foxp3)

# limma reports `logFC` on the log2 scale already, so `log2FC` is a plain
# copy. The previous division by log(2) inflated every value by ~1.44 and
# shifted the 0.58 (1.5-fold) cut-off applied downstream.
de_result_pseudo_pso_foxp3 <- topTable(fit_foxp3, coef = "PSO_yes_no", number = Inf, adjust.method = "BH")
de_result_pseudo_pso_foxp3$log2FC <- de_result_pseudo_pso_foxp3$logFC
de_result_pseudo_pso_foxp3 <- arrange(de_result_pseudo_pso_foxp3, adj.P.Val)

de_result_pseudo_ctrl_foxp3 <- topTable(fit_foxp3, coef = "CTRL_yes_no", number = Inf, adjust.method = "BH")
de_result_pseudo_ctrl_foxp3$log2FC <- de_result_pseudo_ctrl_foxp3$logFC
de_result_pseudo_ctrl_foxp3 <- arrange(de_result_pseudo_ctrl_foxp3, adj.P.Val)


# Volcano plot for PSO (FOXP3+ cells), coloured by adj.P.Val < 0.05
p1_foxp3 <- ggplot(de_result_pseudo_pso_foxp3, aes(x = log2FC, y = -log10(P.Value), col = adj.P.Val < 0.05)) +
  geom_point() +
  scale_colour_manual(values = c('TRUE' = "red", 'FALSE' = "black")) +
  theme_cowplot() +
  ggtitle("Pseudobulk PSO (Volcano)")

# Genes passing both cut-offs: adj.P.Val < 0.05 and |log2FC| >= 0.58.
# Note: this overwrites the `significant_pso_genes` of the Treg analysis.
significant_pso_genes <- subset(de_result_pseudo_pso_foxp3, adj.P.Val < 0.05 & abs(log2FC) >= 0.58)

# Save as a CSV file. The gene symbols live in the row names, so they are
# copied into a `genes` column first; writing with `row.names = FALSE`
# alone produced a table with no gene identifiers.
write.csv(
  tibble::rownames_to_column(significant_pso_genes, var = "genes"),
  file = here("output", "processing", "significant_pso.csv"),
  row.names = FALSE
)

# Same volcano with the passing genes drawn in red and labelled
p1_foxp3_filtrated <- ggplot(de_result_pseudo_pso_foxp3, aes(x = log2FC, y = -log10(P.Value))) +
  geom_point() +
  geom_point(data = significant_pso_genes, aes(col = "Significant"), color = "red") +
  scale_colour_manual(values = c("Significant" = "red", "Not Significant" = "black")) +
  geom_label_repel(data = as_tibble(significant_pso_genes, rownames = "genes"), aes(label = genes), size = 2, max.overlaps = 10) +
  theme_cowplot() +
  ggtitle("Pseudobulk PSO (Volcano)")
p1_foxp3_filtrated
## Label only the significant genes found in the gene list made for the
## RNAseq analysis
fgsea_leg <- read_csv(file = here("input", "secret_gene_list.csv"))

# Move the gene symbols from the row names into a `genes` column
significant_pso_genes <- rownames_to_column(significant_pso_genes, var = "genes")
filtered_significant_genes <- significant_pso_genes[significant_pso_genes$genes %in% fgsea_leg$hgnc_symbol, ]

p1_foxp3_filtrated_selected <- ggplot(de_result_pseudo_pso_foxp3, aes(x = log2FC, y = -log10(P.Value))) +
  geom_point() +
  geom_point(data = significant_pso_genes, aes(col = "Significant"), color = "red") +
  scale_colour_manual(values = c("Significant" = "red", "Not Significant" = "black")) +
  # The label is mapped from the layer's own data (`genes`) instead of
  # `filtered_significant_genes$genes`, which bypasses the layer data.
  geom_label_repel(data = filtered_significant_genes, aes(label = genes), size = 2) +
  theme_cowplot() +
  ggtitle("Pseudobulk PSO (Volcano)")

# Back to the analysis: p2 = control skin (FOXP3+ cells)
# Volcano plot for CTRL, coloured by whether adj.P.Val is below 0.05
p2_foxp3 <- ggplot(
  data = de_result_pseudo_ctrl_foxp3,
  mapping = aes(x = log2FC, y = -log10(P.Value), col = adj.P.Val < 0.05)
) +
  geom_point() +
  scale_colour_manual(values = c('TRUE' = "red", 'FALSE' = "black")) +
  theme_cowplot() +
  ggtitle("Pseudobulk CTRL (Volcano)")

# Genes passing both cut-offs: adj.P.Val < 0.05 and |log2FC| >= 0.58.
# Note: this overwrites the `significant_ctrl_genes` of the Treg analysis.
significant_ctrl_genes <- de_result_pseudo_ctrl_foxp3[
  which(
    de_result_pseudo_ctrl_foxp3$adj.P.Val < 0.05 &
      abs(de_result_pseudo_ctrl_foxp3$log2FC) >= 0.58
  ),
]

# Same volcano with the passing genes drawn in red and labelled
p2_foxp3_filtrated <- ggplot(
  data = de_result_pseudo_ctrl_foxp3,
  mapping = aes(x = log2FC, y = -log10(P.Value))
) +
  geom_point() +
  geom_point(data = significant_ctrl_genes, aes(col = "Significant"), color = "red") +
  scale_colour_manual(values = c("Significant" = "red", "Not Significant" = "black")) +
  geom_text_repel(
    data = as_tibble(significant_ctrl_genes, rownames = "genes"),
    mapping = aes(label = genes),
    size = 2,
    max.overlaps = 10
  ) +
  theme_cowplot() +
  ggtitle("Pseudobulk CTRL (Volcano)")
p2_foxp3_filtrated

# Join the CTRL and PSO results by gene; shared columns get the suffixes
# ".ctrl" and ".pso"
de_result_joint_foxp3 <- full_join(
  x = as_tibble(de_result_pseudo_ctrl_foxp3, rownames = "genes"),
  y = as_tibble(de_result_pseudo_pso_foxp3, rownames = "genes"),
  suffix = c(".ctrl", ".pso"),
  by = "genes"
)

# Genes passing both cut-offs in at least one of the two comparisons
selected_genes_foxp3 <- subset(de_result_joint_foxp3, (adj.P.Val.ctrl <= 0.05 & abs(log2FC.ctrl) >= 0.58) | (adj.P.Val.pso <= 0.05 & abs(log2FC.pso) >= 0.58))
selected_genes_foxp3$adj.P.Val_ctrl_pso <- ifelse(selected_genes_foxp3$adj.P.Val.ctrl <= 0.05 | selected_genes_foxp3$adj.P.Val.pso <= 0.05, "Significant", "Not Significant")

# Background = every gene that was not selected. The match is done on the
# `genes` column: tibbles carry only positional row names ("1", "2", ...),
# so comparing rownames() dropped the first nrow(selected_genes_foxp3) rows
# instead of the selected genes.
background_genes_foxp3 <- de_result_joint_foxp3[
  !de_result_joint_foxp3$genes %in% selected_genes_foxp3$genes,
]

# Plot significant and non-significant genes
ggplot(de_result_joint_foxp3, aes(x = log2FC.ctrl, y = log2FC.pso)) +
  # Non-significant genes in black
  geom_point(data = background_genes_foxp3, color = "black", alpha = 0.4) +
  # Significant genes
  geom_point(data = selected_genes_foxp3, aes(color = adj.P.Val_ctrl_pso)) +
  # Vertical and horizontal lines at logFC = 0
  geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  # Diagonal line
  geom_abline(intercept = 0, slope = 1, linetype = "dotted") +
  # Vertical line at x = 0 (drawn over the dashed red one)
  geom_vline(xintercept = 0) +
  # Horizontal line at y = 0 (drawn over the dashed red one)
  geom_hline(yintercept = 0) +
  labs(x = "log2FC(pseudo_ctrl)", y = "log2FC(pseudo_pso)", title = "Quadrant Representation of log2FC") +
  geom_label_repel(data = selected_genes_foxp3, aes(label = genes), size = 2, max.overlaps = 40) +
  theme_cowplot()

# Genes with adj.P.Val.ctrl <= 0.05 and |log2FC.ctrl| >= 0.58.
# abs() wraps the fold change only; it previously wrapped the whole
# comparison, which kept up-regulated genes only.
genes_ctrl_foxp3 <- subset(selected_genes_foxp3, adj.P.Val.ctrl <= 0.05 & abs(log2FC.ctrl) >= 0.58)$genes
genes_ctrl_foxp3_df <- tibble(genes_ctrl_foxp3)
# Genes with adj.P.Val.pso <= 0.05 and |log2FC.pso| >= 0.58
genes_pso_foxp3 <- subset(selected_genes_foxp3, adj.P.Val.pso <= 0.05 & abs(log2FC.pso) >= 0.58)$genes
genes_pso_foxp3_df <- tibble(genes_pso_foxp3)
# Genes found in only one of the two sets, and genes found in both
unique_genes_ctrl_foxp3 <- setdiff(genes_ctrl_foxp3, genes_pso_foxp3)
unique_genes_pso_foxp3 <- setdiff(genes_pso_foxp3, genes_ctrl_foxp3)
common_genes_foxp3 <- intersect(genes_ctrl_foxp3, genes_pso_foxp3)
common_genes_foxp3_df <- tibble(common_genes_foxp3)
# Flag, on the joint table, the genes specific to one comparison
selected_genes_foxp3$unique_genes_ctrl_foxp3 <- ifelse(selected_genes_foxp3$genes %in% unique_genes_ctrl_foxp3, "Yes", "No")
selected_genes_foxp3$unique_to_pso_foxp3 <- ifelse(selected_genes_foxp3$genes %in% unique_genes_pso_foxp3, "Yes", "No")


#GSEA------------------

# Full MSigDB table for human
msigdb <- msigdbr::msigdbr(species = "Homo sapiens")

# GO gene sets: rows of the C5 category whose sub-category starts with
# "GO:", turned into a named list (gs_name -> vector of gene symbols).
# Split on gs_exact_source instead if the results go to revigo afterwards.
go <- local({
  c5_sets <- msigdbr::msigdbr(species = "Homo sapiens", category = "C5")
  c5_go <- c5_sets[grepl("^GO\\:", c5_sets$gs_subcat), ]
  split(x = c5_go$gene_symbol, f = c5_go$gs_name)
})


##GSEA of pseudobulk only PSO-------------------

# Rank genes by absolute logFC. The gene names are attached BEFORE sorting;
# naming the already-sorted vector with the unsorted row names paired every
# gene with another gene's statistic.
genes_gsea_pso <- rownames(de_result_pseudo_pso)
logFC_gsea_pso <- abs(de_result_pseudo_pso$logFC)
names(logFC_gsea_pso) <- genes_gsea_pso
gene_rank_gsea_pso <- sort(logFC_gsea_pso, decreasing = TRUE)

# One-sided enrichment (scoreType = "pos") because the statistic is
# non-negative
gsea_go_result_pso <- fgsea(pathways = go, stats = gene_rank_gsea_pso, scoreType = "pos", nPermSimple = 10000, eps = 0)

# Pathways with the 20 smallest and the 20 largest p-values.
# NOTE(review): the ranking uses |logFC|, so it carries no direction; the
# "Upregulated" / "Downregulated" titles below presumably mean most / least
# enriched. Confirm the intended wording.
top_go <- head(gsea_go_result_pso[order(gsea_go_result_pso$pval), ], 20)
down_go <- head(gsea_go_result_pso[order(gsea_go_result_pso$pval, decreasing = TRUE), ], 20)
top_go_plot <- ggplot(top_go, aes(x = pval, y = pathway)) +
  geom_point(color = "red") +
  labs(title = "Top 20 Upregulated Pathways (GO)", x = "P-value", y = "Pathway")
down_go_plot <- ggplot(down_go, aes(x = pval, y = pathway)) +
  geom_point(color = "blue") +
  labs(title = "Top 20 Downregulated Pathways (GO)", x = "P-value", y = "Pathway")

#TCR analysis---------------
# NOTE(review): plp_FOXP3 is reloaded here, but the statements below work
# on plp_treg; confirm which object is intended.
plp_FOXP3 <- read_rds(file = here("output", "processing", "plp_FOXP3.rds.gz"))

# Create a new Seurat object with only the cells whose tcr.type is "T-AB"
plp_treg_TCR <- subset(plp_treg, tcr.type == "T-AB")
table(plp_treg_TCR$patient)
Idents(plp_treg_TCR) <- plp_treg_TCR$patient

# Downsample every patient to at most 351 cells so the abundance comparison
# is fair (351 = lowest per-patient count, per the original author; compare
# with the table printed above).
plp_treg_down <- subset(plp_treg_TCR, downsample = 351)
table(plp_treg_down$patient)

# Clonal comparisons between LAYN-positive and LAYN-negative cells, using
# the cell metadata split by LAYN_POS
clonalCompare(plp_treg_down[[]] %>% split(.$LAYN_POS),
              top.clones = 15,
              cloneCall = "aa",
              graph = "alluvial")

# `TRUE` is spelled out because `T` is an ordinary, reassignable variable
clonalHomeostasis(plp_treg_down[[]] %>% split(.$LAYN_POS),
                  cloneCall = "gene",
                  exportTable = TRUE)

clonalProportion(plp_treg_down[[]] %>% split(.$LAYN_POS),
                 cloneCall = "gene")

# NOTE(review): the levels below are clone-size bins, but the identities
# are taken from the CTaa column; if CTaa holds amino-acid clonotypes, every
# identity becomes NA. Presumably the clone-size metadata column was
# intended. Verify against the object's metadata.
Idents(plp_treg_down) <- plp_treg_down$CTaa
Idents(plp_treg_down) <- factor(x = Idents(plp_treg_down), levels = c("Hyperexpanded (100 < X <= 500)", "Large (20 < X <= 100)", "Medium (5 < X <= 20)", "Small (1 < X <= 5)", "Single (0 < X <= 1)"))
